import pandas as pd
import re
from collections import Counter


# Bracketed emote codes such as "[doge]": one or more word characters
# (CJK included) or whitespace enclosed in square brackets.
_BRACKET_EMOTE_RE = re.compile(r'\[[\w\s\u4e00-\u9fff]+\]')

# Runs of Unicode emoji code points. The trailing "+" means consecutive
# emoji are returned together as a single token.
_UNICODE_EMOJI_RE = re.compile(
    r'['
    r'\U0001F600-\U0001F64F'  # Emoticons
    r'\U0001F300-\U0001F5FF'  # Miscellaneous Symbols and Pictographs
    r'\U0001F680-\U0001F6FF'  # Transport and Map Symbols
    r'\U0001F1E0-\U0001F1FF'  # Regional indicator symbols (flags)
    r'\U0001F900-\U0001F9FF'  # Supplemental Symbols and Pictographs
    r'\U0001FA70-\U0001FAFF'  # Symbols and Pictographs Extended-A
    r']+'
)


def extract_emojis(text):
    """Return the emote tokens found in *text*.

    Two kinds of token are recognised:

    * bracketed emote codes, e.g. ``[doge]`` (brackets are kept);
    * runs of Unicode emoji characters (a run of adjacent emoji is one token).

    Args:
        text: The comment to scan. Any non-``str`` value (for example the
            ``NaN`` pandas uses for missing cells) yields an empty list.

    Returns:
        A list with all bracketed tokens first, followed by all Unicode emoji
        runs, each group in order of appearance.
    """
    if not isinstance(text, str):
        return []
    return _BRACKET_EMOTE_RE.findall(text) + _UNICODE_EMOJI_RE.findall(text)


# Load the comment data; the CSV is expected to provide a 'comment' column.
df = pd.read_csv('bili_remark_data.csv', encoding='utf-8')

# Flatten the emote tokens of every comment into one list, in encounter order.
all_emojis = [
    token
    for comment in df['comment']
    for token in extract_emojis(comment)
]

# Tally how often each distinct token occurs.
emoji_counter = Counter(all_emojis)

# Write one "<token>: <label>" line per distinct token, most frequent first.
# The label is the token with leading/trailing square brackets stripped and
# spaces replaced by underscores.
with open('emoji_mappings.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(
        f"{token}: {token.strip('[]').replace(' ', '_')}\n"
        for token, _ in emoji_counter.most_common()
    )

print("提取完成，结果已保存到 emoji_mappings.txt 文件中。")